import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
import glob
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Column names shared by every station table.
lat = 'latitude'
lng = 'longitude'
sid = 'station_id'
# Number of nearest weather stations looked up for each air-quality station.
k_nn = 3

# Every raw CSV lives in the same directory; keep the prefix in one place.
DATA_DIR = 'D:/Akhila/Air pollution data set/'

# Air Quality files
air_quality_201701_201801 = pd.read_csv(DATA_DIR + 'beijing_17_18_aq.csv')
air_quality_201802_201803 = pd.read_csv(DATA_DIR + 'beijing_201802_201803_aq.csv')
# Grid Weather
grid_station_data = pd.read_csv(DATA_DIR + 'Beijing_historical_meo_grid.csv')
# Observed Weather
observed_station_data = pd.read_csv(DATA_DIR + 'beijing_17_18_meo.csv')
# Locations (all three tables are indexed by station id)
air_quality_stations = pd.read_csv(DATA_DIR + 'airQuality_station.csv', index_col=sid)
# Column names for the grid station file are supplied here rather than read from it.
grid_weather_stations = pd.read_csv(DATA_DIR + 'Beijing_grid_weather_station.csv',
                                    names=[sid, lat, lng], index_col=0)
observed_weather_stations = pd.read_csv(DATA_DIR + 'observed_weather_stations.csv', index_col=sid)
def sortAndUniqueTime(time_df):
    """Return the distinct values of *time_df* as a sorted DatetimeIndex.

    *time_df* must expose ``.unique()`` (e.g. a pandas Series) and hold
    values that ``pd.to_datetime`` can parse.
    """
    return pd.to_datetime(time_df.unique()).sort_values()
# Air quality tables: first rows, shape and sorted distinct hours of each raw file
for aq_frame in (air_quality_201701_201801, air_quality_201802_201803):
    display(aq_frame.head(3))
    display(aq_frame.shape)
    display(sortAndUniqueTime(aq_frame.utc_time))
| stationId | utc_time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2017-01-01 14:00:00 | 453.0 | 467.0 | 156.0 | 7.2 | 3.0 | 9.0 |
| 1 | aotizhongxin_aq | 2017-01-01 15:00:00 | 417.0 | 443.0 | 143.0 | 6.8 | 2.0 | 8.0 |
| 2 | aotizhongxin_aq | 2017-01-01 16:00:00 | 395.0 | 467.0 | 141.0 | 6.9 | 3.0 | 8.0 |
(311010, 8)
DatetimeIndex(['2017-01-01 14:00:00', '2017-01-01 15:00:00',
'2017-01-01 16:00:00', '2017-01-01 17:00:00',
'2017-01-01 18:00:00', '2017-01-01 19:00:00',
'2017-01-01 20:00:00', '2017-01-01 21:00:00',
'2017-01-01 22:00:00', '2017-01-01 23:00:00',
...
'2018-01-31 06:00:00', '2018-01-31 07:00:00',
'2018-01-31 08:00:00', '2018-01-31 09:00:00',
'2018-01-31 10:00:00', '2018-01-31 11:00:00',
'2018-01-31 12:00:00', '2018-01-31 13:00:00',
'2018-01-31 14:00:00', '2018-01-31 15:00:00'],
dtype='datetime64[ns]', length=8701, freq=None)
| stationId | utc_time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2018-01-31 16:00:00 | 49.0 | 82.0 | 90.0 | 0.9 | 6.0 | 10.0 |
| 1 | aotizhongxin_aq | 2018-01-31 17:00:00 | 47.0 | 80.0 | 90.0 | 0.9 | 5.0 | 10.0 |
| 2 | aotizhongxin_aq | 2018-01-31 18:00:00 | 46.0 | 91.0 | 91.0 | 1.3 | 5.0 | 28.0 |
(49420, 8)
DatetimeIndex(['2018-01-31 16:00:00', '2018-01-31 17:00:00',
'2018-01-31 18:00:00', '2018-01-31 19:00:00',
'2018-01-31 20:00:00', '2018-01-31 21:00:00',
'2018-01-31 22:00:00', '2018-01-31 23:00:00',
'2018-02-01 00:00:00', '2018-02-01 01:00:00',
...
'2018-03-31 06:00:00', '2018-03-31 07:00:00',
'2018-03-31 08:00:00', '2018-03-31 09:00:00',
'2018-03-31 10:00:00', '2018-03-31 11:00:00',
'2018-03-31 12:00:00', '2018-03-31 13:00:00',
'2018-03-31 14:00:00', '2018-03-31 15:00:00'],
dtype='datetime64[ns]', length=1412, freq=None)
# Grid weather table: first rows, then shape, then sorted distinct hours
grid_head = grid_station_data.head(3)
display(grid_head)
grid_shape = grid_station_data.shape
display(grid_shape)
grid_hours = sortAndUniqueTime(grid_station_data.utc_time)
display(grid_hours)
| stationName | longitude | latitude | utc_time | temperature | pressure | humidity | wind_direction | wind_speed/kph | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | 115.0 | 39.0 | 2017-01-01 00:00:00 | -5.47 | 984.73 | 76.6 | 53.71 | 3.53 |
| 1 | beijing_grid_001 | 115.0 | 39.1 | 2017-01-01 00:00:00 | -5.53 | 979.33 | 75.4 | 43.59 | 3.11 |
| 2 | beijing_grid_002 | 115.0 | 39.2 | 2017-01-01 00:00:00 | -5.70 | 963.14 | 71.8 | 0.97 | 2.75 |
(7034706, 9)
DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:00:00',
'2017-01-01 02:00:00', '2017-01-01 03:00:00',
'2017-01-01 04:00:00', '2017-01-01 05:00:00',
'2017-01-01 06:00:00', '2017-01-01 07:00:00',
'2017-01-01 08:00:00', '2017-01-01 09:00:00',
...
'2018-03-26 20:00:00', '2018-03-26 21:00:00',
'2018-03-26 22:00:00', '2018-03-26 23:00:00',
'2018-03-27 00:00:00', '2018-03-27 01:00:00',
'2018-03-27 02:00:00', '2018-03-27 03:00:00',
'2018-03-27 04:00:00', '2018-03-27 05:00:00'],
dtype='datetime64[ns]', length=10806, freq=None)
# Observed weather table: first rows and shape
display(observed_station_data.head(3))
display(observed_station_data.shape)
# Sorted distinct hours, via the same helper used for the other tables
ob_station_data = sortAndUniqueTime(observed_station_data.utc_time)
display(ob_station_data)
| station_id | longitude | latitude | utc_time | temperature | pressure | humidity | wind_direction | wind_speed | weather | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 16:00:00 | -1.7 | 1028.7 | 15 | 215.0 | 1.6 | Sunny/clear |
| 1 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 17:00:00 | -3.5 | 1028.4 | 24 | 16.0 | 1.0 | Haze |
| 2 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 18:00:00 | -3.7 | 1028.1 | 27 | 32.0 | 1.1 | Haze |
(158047, 10)
DatetimeIndex(['2017-01-30 16:00:00', '2017-01-30 17:00:00',
'2017-01-30 18:00:00', '2017-01-30 19:00:00',
'2017-01-30 20:00:00', '2017-01-30 21:00:00',
'2017-01-30 22:00:00', '2017-01-30 23:00:00',
'2017-01-31 00:00:00', '2017-01-31 01:00:00',
...
'2018-01-31 06:00:00', '2018-01-31 07:00:00',
'2018-01-31 08:00:00', '2018-01-31 09:00:00',
'2018-01-31 10:00:00', '2018-01-31 11:00:00',
'2018-01-31 12:00:00', '2018-01-31 13:00:00',
'2018-01-31 14:00:00', '2018-01-31 15:00:00'],
dtype='datetime64[ns]', length=8782, freq=None)
# Display the station tables: shape first, then the first five rows of each
for station_table in (air_quality_stations, grid_weather_stations, observed_weather_stations):
    print(station_table.shape)
    display(station_table.head())
(35, 2)
| longitude | latitude | |
|---|---|---|
| station_id | ||
| dongsi_aq | 116.417 | 39.929 |
| tiantan_aq | 116.407 | 39.886 |
| guanyuan_aq | 116.339 | 39.929 |
| wanshouxigong_aq | 116.352 | 39.878 |
| aotizhongxin_aq | 116.397 | 39.982 |
(651, 2)
| latitude | longitude | |
|---|---|---|
| station_id | ||
| beijing_grid_000 | 39.0 | 115.0 |
| beijing_grid_001 | 39.1 | 115.0 |
| beijing_grid_002 | 39.2 | 115.0 |
| beijing_grid_003 | 39.3 | 115.0 |
| beijing_grid_004 | 39.4 | 115.0 |
(18, 2)
| longitude | latitude | |
|---|---|---|
| station_id | ||
| shunyi_meo | 116.615278 | 40.126667 |
| hadian_meo | 116.290556 | 39.986944 |
| yanqing_meo | 115.968889 | 40.449444 |
| miyun_meo | 116.864167 | 40.377500 |
| huairou_meo | 116.626944 | 40.357778 |
# Stack grid and observed weather stations into a single lookup table.
# Rows are appended (axis=0); sort=False leaves the column order unsorted.
weather_station_tables = [grid_weather_stations, observed_weather_stations]
stations_all_df = pd.concat(weather_station_tables, axis=0, sort=False)
stations_all_df.head()
| latitude | longitude | |
|---|---|---|
| station_id | ||
| beijing_grid_000 | 39.0 | 115.0 |
| beijing_grid_001 | 39.1 | 115.0 |
| beijing_grid_002 | 39.2 | 115.0 |
| beijing_grid_003 | 39.3 | 115.0 |
| beijing_grid_004 | 39.4 | 115.0 |
stations_all_df.iloc[0].name
'beijing_grid_000'
from sklearn.neighbors import NearestNeighbors
def KNNDataFrame(k_nn=3):
    """Attach the ``k_nn`` nearest weather stations to every air-quality station.

    Reads the module-level ``air_quality_stations`` and ``stations_all_df``
    tables (both indexed by station id, with latitude/longitude columns in
    degrees) and returns a copy of ``air_quality_stations`` extended, for
    each neighbour rank ``i`` in ``range(k_nn)``, with the columns:

    * ``N{i}``      - station id of the i-th nearest weather station
    * ``N{i}_lat``  - its latitude
    * ``N{i}_lng``  - its longitude
    * ``N{i}_dist`` - great-circle distance to it, in kilometres
    """
    earth_radius_km = 6371.0

    # scikit-learn's haversine metric expects [latitude, longitude] pairs in
    # RADIANS and returns distances in radians. Feeding degrees silently
    # produces wrong neighbour rankings and wrong distances, so convert here
    # and pick the two columns explicitly to pin their order.
    station_coords_rad = np.radians(stations_all_df[[lat, lng]].to_numpy(dtype=float))
    nn_finder = NearestNeighbors(n_neighbors=k_nn, metric='haversine', n_jobs=-1)
    nn_finder.fit(station_coords_rad)

    # Start from the air-quality stations and pre-create the neighbour columns
    # so the per-cell assignments below have the right dtype.
    aq_stations_knn_df = air_quality_stations.copy()
    for i in range(k_nn):
        aq_stations_knn_df["N{}".format(i)] = ''
        aq_stations_knn_df["N{}_lat".format(i)] = .0
        aq_stations_knn_df["N{}_lng".format(i)] = .0
        aq_stations_knn_df["N{}_dist".format(i)] = .0

    for station in aq_stations_knn_df.index:
        latitude = aq_stations_knn_df.at[station, lat]
        longitude = aq_stations_knn_df.at[station, lng]
        query_rad = np.radians([[latitude, longitude]])
        dist_l, index_l = nn_finder.kneighbors(query_rad, return_distance=True)

        # (row position in stations_all_df, distance in radians), nearest first
        results_zip = list(zip(index_l.ravel(), dist_l.ravel()))
        for i, (neighbor_index, neighbor_rad) in enumerate(results_zip):
            neighbor_row = stations_all_df.iloc[neighbor_index]
            aq_stations_knn_df.at[station, 'N{}'.format(i)] = neighbor_row.name
            aq_stations_knn_df.at[station, 'N{}_lat'.format(i)] = neighbor_row[lat]
            aq_stations_knn_df.at[station, 'N{}_lng'.format(i)] = neighbor_row[lng]
            # radians on the unit sphere -> kilometres on Earth
            aq_stations_knn_df.at[station, 'N{}_dist'.format(i)] = neighbor_rad * earth_radius_km
        print(results_zip)
    return aq_stations_knn_df
aq_stations_knn_df = KNNDataFrame(k_nn=k_nn)
display(aq_stations_knn_df.head())
print(aq_stations_knn_df.shape)
[(303, 0.03074564946081279), (659, 0.05715319502442078), (324, 0.05767359548541588)] [(303, 0.014583690456893396), (324, 0.05603132186058166), (282, 0.06396742401445471)] [(282, 0.03728090443487231), (303, 0.04672897673407377), (652, 0.06559306199188342)] [(303, 0.03549001722986746), (282, 0.03733850271060311), (665, 0.06110136326287754)] [(304, 0.018108625524897788), (283, 0.06650378454411798), (652, 0.06990770888901224)] [(659, 0.029316950408647114), (324, 0.043858471761035446), (303, 0.05220670870271248)] [(652, 0.0023372803482232204), (283, 0.015590884211383995), (262, 0.05903792336716099)] [(263, 0.021558860402400737), (242, 0.05526685901956946), (262, 0.09181558361545737)] [(262, 0.005083712676395151), (652, 0.05738019379336394), (664, 0.05951038490420543)] [(665, 0.020298517778200444), (282, 0.038913550197383524), (261, 0.05851585033676683)] [(239, 0.03374787786448513), (260, 0.03676554387125713), (667, 0.056428052072776444)] [(261, 0.01692858557999603), (662, 0.030867762433433248), (664, 0.031331167141164656)] [(667, 0.041307217565432035), (238, 0.044830630315907526), (259, 0.0504049244278248)] [(301, 0.018080112190774326), (666, 0.021490319832254474), (322, 0.04458539439303875)] [(323, 0.0058416238649557336), (663, 0.02157919181541224), (344, 0.04757466524263286)] [(366, 0.02573125488872463), (345, 0.0393333834740917), (658, 0.06518944745272487)] [(651, 0.03002436389007527), (368, 0.04311334201755241), (347, 0.04915897047023217)] [(660, 0.016204181407483694), (264, 0.02957490712361898), (285, 0.058968673098970074)] [(240, 0.03717696657622494), (662, 0.057733019935713094), (664, 0.061866752415177854)] [(657, 0.02981570883905368), (452, 0.04299999999999926), (453, 0.05700000000000216)] [(655, 0.02979221500871478), (349, 0.037010819699075555), (370, 0.06824294145418926)] [(654, 0.029685691147413804), (392, 0.04153971300740192), (413, 0.06802681327608798)] [(653, 0.004574435503476028), (225, 0.0537897505823065), (224, 0.05887071626426153)] [(265, 
0.018883748567083004), (286, 0.06888328231066637), (660, 0.06901783610408137)] [(224, 0.03661698716073438), (223, 0.06583940599239149), (653, 0.08620536974018635)] [(414, 0.010413992343919414), (435, 0.0838722097568699), (413, 0.09952024859642693)] [(452, 0.01475975481735179), (473, 0.0590322959009603), (657, 0.06946500466781169)] [(385, 0.013984054301872743), (364, 0.03704525770986642), (406, 0.050830072587286425)] [(278, 0.020000000000003126), (257, 0.0310490489620633), (299, 0.031049048962065878)] [(216, 0.020000000000003126), (237, 0.0372611468220792), (195, 0.0372611468220792)] [(303, 0.0031092696816156537), (282, 0.055932594438877284), (324, 0.06181507636656065)] [(303, 0.024250457564372108), (282, 0.05950357491986034), (324, 0.06591985907008456)] [(652, 0.05003921139719583), (283, 0.05593052894689947), (304, 0.05667918816059602)] [(303, 0.047642551006313386), (302, 0.058502162083371285), (282, 0.05867766200530408)] [(659, 0.01751249101075208), (324, 0.040331725198132246), (325, 0.061972794356129644)]
| longitude | latitude | N0 | N0_lat | N0_lng | N0_dist | N1 | N1_lat | N1_lng | N1_dist | N2 | N2_lat | N2_lng | N2_dist | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| station_id | ||||||||||||||
| dongsi_aq | 116.417 | 39.929 | beijing_grid_303 | 39.9 | 116.4 | 195.880533 | chaoyang_meo | 39.9525 | 116.500833 | 364.123006 | beijing_grid_324 | 39.900000 | 116.500000 | 367.438477 |
| tiantan_aq | 116.407 | 39.886 | beijing_grid_303 | 39.9 | 116.4 | 92.912692 | beijing_grid_324 | 39.9000 | 116.500000 | 356.975552 | beijing_grid_282 | 39.900000 | 116.300000 | 407.536458 |
| guanyuan_aq | 116.339 | 39.929 | beijing_grid_282 | 39.9 | 116.3 | 237.516642 | beijing_grid_303 | 39.9000 | 116.400000 | 297.710311 | hadian_meo | 39.986944 | 116.290556 | 417.893398 |
| wanshouxigong_aq | 116.352 | 39.878 | beijing_grid_303 | 39.9 | 116.4 | 226.106900 | beijing_grid_282 | 39.9000 | 116.300000 | 237.883601 | fengtai_meo | 39.870278 | 116.245278 | 389.276785 |
| aotizhongxin_aq | 116.397 | 39.982 | beijing_grid_304 | 40.0 | 116.4 | 115.370053 | beijing_grid_283 | 40.0000 | 116.300000 | 423.695611 | hadian_meo | 39.986944 | 116.290556 | 445.382013 |
(35, 14)
display(air_quality_201701_201801.head(3))
display(air_quality_201701_201801.shape)
print(air_quality_201701_201801.dtypes)
display(air_quality_201802_201803.head(3))
display(air_quality_201802_201803.shape)
| stationId | utc_time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2017-01-01 14:00:00 | 453.0 | 467.0 | 156.0 | 7.2 | 3.0 | 9.0 |
| 1 | aotizhongxin_aq | 2017-01-01 15:00:00 | 417.0 | 443.0 | 143.0 | 6.8 | 2.0 | 8.0 |
| 2 | aotizhongxin_aq | 2017-01-01 16:00:00 | 395.0 | 467.0 | 141.0 | 6.9 | 3.0 | 8.0 |
(311010, 8)
stationId object utc_time object PM2.5 float64 PM10 float64 NO2 float64 CO float64 O3 float64 SO2 float64 dtype: object
| stationId | utc_time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2018-01-31 16:00:00 | 49.0 | 82.0 | 90.0 | 0.9 | 6.0 | 10.0 |
| 1 | aotizhongxin_aq | 2018-01-31 17:00:00 | 47.0 | 80.0 | 90.0 | 0.9 | 5.0 | 10.0 |
| 2 | aotizhongxin_aq | 2018-01-31 18:00:00 | 46.0 | 91.0 | 91.0 | 1.3 | 5.0 | 28.0 |
(49420, 8)
import datetime as dt
# Stack the 2017-01..2018-01 and 2018-02..2018-03 air-quality tables
airQ_data = pd.concat([air_quality_201701_201801, air_quality_201802_201803], axis=0, ignore_index=True)
# Parse the timestamps in one vectorised call instead of one pd.Timestamp per row
airQ_data['utc_time'] = pd.to_datetime(airQ_data['utc_time'])
airQ_data = airQ_data.rename(columns={"stationId": "station_id",
                                      "utc_time": "time"})
# Drop rows that are exact duplicates across all columns
airQ_data = airQ_data.drop_duplicates()
display(airQ_data.head())
print(airQ_data.shape)
| station_id | time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2017-01-01 14:00:00 | 453.0 | 467.0 | 156.0 | 7.2 | 3.0 | 9.0 |
| 1 | aotizhongxin_aq | 2017-01-01 15:00:00 | 417.0 | 443.0 | 143.0 | 6.8 | 2.0 | 8.0 |
| 2 | aotizhongxin_aq | 2017-01-01 16:00:00 | 395.0 | 467.0 | 141.0 | 6.9 | 3.0 | 8.0 |
| 3 | aotizhongxin_aq | 2017-01-01 17:00:00 | 420.0 | 484.0 | 139.0 | 7.4 | 3.0 | 9.0 |
| 4 | aotizhongxin_aq | 2017-01-01 18:00:00 | 453.0 | 520.0 | 157.0 | 7.6 | 4.0 | 9.0 |
(353955, 8)
airQ_data.isnull().sum()
station_id 0 time 0 PM2.5 22834 PM10 94780 NO2 21124 CO 45565 O3 23145 SO2 21076 dtype: int64
airQ_data.isnull().sum() * 100 / len(airQ_data)
station_id 0.000000 time 0.000000 PM2.5 6.451103 PM10 26.777415 NO2 5.967990 CO 12.873105 O3 6.538967 SO2 5.954429 dtype: float64
airQ_data.describe()
| PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|
| count | 331121.000000 | 259175.000000 | 332831.000000 | 308390.000000 | 330810.000000 | 332879.000000 |
| mean | 61.191407 | 91.342772 | 46.066382 | 0.971086 | 55.052323 | 9.199598 |
| std | 67.385924 | 103.024487 | 32.246534 | 0.973267 | 51.610243 | 11.596243 |
| min | 2.000000 | 5.000000 | 1.000000 | 0.100000 | 1.000000 | 1.000000 |
| 25% | 16.000000 | 38.000000 | 20.000000 | 0.400000 | 13.000000 | 2.000000 |
| 50% | 41.000000 | 71.000000 | 40.000000 | 0.700000 | 46.000000 | 5.000000 |
| 75% | 81.000000 | 116.000000 | 66.000000 | 1.200000 | 78.000000 | 12.000000 |
| max | 1574.000000 | 3280.000000 | 300.000000 | 15.000000 | 504.000000 | 307.000000 |
display(airQ_data.iloc[np.r_[0:1, -1:0]])
display(airQ_data.shape)
| station_id | time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2017-01-01 14:00:00 | 453.0 | 467.0 | 156.0 | 7.2 | 3.0 | 9.0 |
| 360429 | zhiwuyuan_aq | 2018-03-31 15:00:00 | NaN | NaN | NaN | NaN | NaN | NaN |
(353955, 8)
# Renumbered copy (0..n-1) of airQ_data; show its first and last rows
airQ_data_new = airQ_data.copy().reset_index(drop=True)
display(airQ_data_new.iloc[[0, -1]])
display(airQ_data_new.shape)
| station_id | time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | aotizhongxin_aq | 2017-01-01 14:00:00 | 453.0 | 467.0 | 156.0 | 7.2 | 3.0 | 9.0 |
| 353954 | zhiwuyuan_aq | 2018-03-31 15:00:00 | NaN | NaN | NaN | NaN | NaN | NaN |
(353955, 8)
len(airQ_data.time.unique())
10113
display(grid_station_data.head(5))
display(grid_station_data.shape)
| stationName | longitude | latitude | utc_time | temperature | pressure | humidity | wind_direction | wind_speed/kph | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | 115.0 | 39.0 | 2017-01-01 00:00:00 | -5.47 | 984.73 | 76.60 | 53.71 | 3.53 |
| 1 | beijing_grid_001 | 115.0 | 39.1 | 2017-01-01 00:00:00 | -5.53 | 979.33 | 75.40 | 43.59 | 3.11 |
| 2 | beijing_grid_002 | 115.0 | 39.2 | 2017-01-01 00:00:00 | -5.70 | 963.14 | 71.80 | 0.97 | 2.75 |
| 3 | beijing_grid_003 | 115.0 | 39.3 | 2017-01-01 00:00:00 | -5.88 | 946.94 | 68.20 | 327.65 | 3.84 |
| 4 | beijing_grid_004 | 115.0 | 39.4 | 2017-01-01 00:00:00 | -5.34 | 928.80 | 58.81 | 317.85 | 6.14 |
(7034706, 9)
# Parse the timestamps in one vectorised call; a per-row pd.Timestamp over
# millions of rows is needlessly slow.
grid_station_data['time'] = pd.to_datetime(grid_station_data.utc_time)
# The raw time string and the coordinates are no longer needed per reading.
grid_station_data = grid_station_data.drop(['utc_time', 'longitude', 'latitude'], axis=1)
# Align column names with the observed-weather table.
grid_station_data = grid_station_data.rename(columns={'stationName': 'station_id', 'wind_speed/kph': 'wind_speed'})
# Grid data has no weather description; fill with the string 'None' so both
# weather tables share the same columns.
grid_station_data['weather'] = 'None'
print(grid_station_data.shape)
display(grid_station_data.head())
(7034706, 8)
| station_id | temperature | pressure | humidity | wind_direction | wind_speed | time | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | -5.47 | 984.73 | 76.60 | 53.71 | 3.53 | 2017-01-01 | None |
| 1 | beijing_grid_001 | -5.53 | 979.33 | 75.40 | 43.59 | 3.11 | 2017-01-01 | None |
| 2 | beijing_grid_002 | -5.70 | 963.14 | 71.80 | 0.97 | 2.75 | 2017-01-01 | None |
| 3 | beijing_grid_003 | -5.88 | 946.94 | 68.20 | 327.65 | 3.84 | 2017-01-01 | None |
| 4 | beijing_grid_004 | -5.34 | 928.80 | 58.81 | 317.85 | 6.14 | 2017-01-01 | None |
# Renumbered copy (0..n-1) of grid_station_data; show its first and last rows
grid_station_data_new = grid_station_data.copy().reset_index(drop=True)
display(grid_station_data_new.iloc[[0, -1]])
display(grid_station_data_new.shape)
| station_id | temperature | pressure | humidity | wind_direction | wind_speed | time | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | -5.47 | 984.73 | 76.60 | 53.71 | 3.53 | 2017-01-01 00:00:00 | None |
| 7034705 | beijing_grid_650 | 22.73 | 942.95 | 21.78 | 215.27 | 18.91 | 2018-03-27 05:00:00 | None |
(7034706, 8)
display(observed_station_data.head(5))
display(observed_station_data.shape)
| station_id | longitude | latitude | utc_time | temperature | pressure | humidity | wind_direction | wind_speed | weather | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 16:00:00 | -1.7 | 1028.7 | 15 | 215.0 | 1.6 | Sunny/clear |
| 1 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 17:00:00 | -3.5 | 1028.4 | 24 | 16.0 | 1.0 | Haze |
| 2 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 18:00:00 | -3.7 | 1028.1 | 27 | 32.0 | 1.1 | Haze |
| 3 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 19:00:00 | -3.9 | 1027.6 | 27 | 21.0 | 0.9 | Haze |
| 4 | shunyi_meo | 116.615278 | 40.126667 | 2017-01-30 20:00:00 | -4.1 | 1026.7 | 26 | 17.0 | 1.1 | Haze |
(158047, 10)
# Parse the timestamps in one vectorised call instead of one pd.Timestamp per row
observed_station_data['utc_time'] = pd.to_datetime(observed_station_data.utc_time)
# Use the same column name ('time') as the other cleaned tables
observed_station_data = observed_station_data.rename(columns={'utc_time': 'time'})
# Coordinates are dropped from the per-reading table
observed_station_data = observed_station_data.drop(['longitude', 'latitude'], axis=1)
print(observed_station_data.shape)
display(observed_station_data.head())
(158047, 8)
| station_id | time | temperature | pressure | humidity | wind_direction | wind_speed | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | shunyi_meo | 2017-01-30 16:00:00 | -1.7 | 1028.7 | 15 | 215.0 | 1.6 | Sunny/clear |
| 1 | shunyi_meo | 2017-01-30 17:00:00 | -3.5 | 1028.4 | 24 | 16.0 | 1.0 | Haze |
| 2 | shunyi_meo | 2017-01-30 18:00:00 | -3.7 | 1028.1 | 27 | 32.0 | 1.1 | Haze |
| 3 | shunyi_meo | 2017-01-30 19:00:00 | -3.9 | 1027.6 | 27 | 21.0 | 0.9 | Haze |
| 4 | shunyi_meo | 2017-01-30 20:00:00 | -4.1 | 1026.7 | 26 | 17.0 | 1.1 | Haze |
# Renumbered copy (0..n-1) of observed_station_data; show its first and last rows
observed_station_data_new = observed_station_data.copy().reset_index(drop=True)
display(observed_station_data_new.iloc[[0, -1]])
display(observed_station_data_new.shape)
| station_id | time | temperature | pressure | humidity | wind_direction | wind_speed | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | shunyi_meo | 2017-01-30 16:00:00 | -1.7 | 1028.7 | 15 | 215.0 | 1.6 | Sunny/clear |
| 158046 | xiayunling_meo | 2018-01-31 15:00:00 | -7.4 | 977.4 | 39 | 137.0 | 1.0 | Sunny/clear |
(158047, 8)
# Stack grid and observed weather readings into one long table with a fresh index
weather_frames = [grid_station_data, observed_station_data]
weather_data = pd.concat(weather_frames, axis=0, ignore_index=True, sort=False)
print(weather_data.shape)
display(weather_data.head())
(7192753, 8)
| station_id | temperature | pressure | humidity | wind_direction | wind_speed | time | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | -5.47 | 984.73 | 76.60 | 53.71 | 3.53 | 2017-01-01 | None |
| 1 | beijing_grid_001 | -5.53 | 979.33 | 75.40 | 43.59 | 3.11 | 2017-01-01 | None |
| 2 | beijing_grid_002 | -5.70 | 963.14 | 71.80 | 0.97 | 2.75 | 2017-01-01 | None |
| 3 | beijing_grid_003 | -5.88 | 946.94 | 68.20 | 327.65 | 3.84 | 2017-01-01 | None |
| 4 | beijing_grid_004 | -5.34 | 928.80 | 58.81 | 317.85 | 6.14 | 2017-01-01 | None |
# Independent copy of weather_data; show its first and last rows
weather_data_new = weather_data.copy()
display(weather_data_new.iloc[[0, -1]])
display(weather_data_new.shape)
| station_id | temperature | pressure | humidity | wind_direction | wind_speed | time | weather | |
|---|---|---|---|---|---|---|---|---|
| 0 | beijing_grid_000 | -5.47 | 984.73 | 76.6 | 53.71 | 3.53 | 2017-01-01 00:00:00 | None |
| 7192752 | xiayunling_meo | -7.40 | 977.40 | 39.0 | 137.00 | 1.00 | 2018-01-31 15:00:00 | Sunny/clear |
(7192753, 8)
# List the distinct wind_direction values above 360 in the observed data
# (presumably placeholder codes rather than real bearings - verify against the data source)
above_360 = observed_station_data.wind_direction > 360.
print(observed_station_data[above_360].wind_direction.unique())
weather_data.describe()
[999017. 999999.]
| temperature | pressure | humidity | wind_direction | wind_speed | |
|---|---|---|---|---|---|
| count | 7.192753e+06 | 7.192753e+06 | 7.192753e+06 | 7.192519e+06 | 7.192519e+06 |
| mean | 9.542558e+00 | 9.694494e+02 | 5.023605e+01 | 9.831192e+02 | 1.180248e+01 |
| std | 7.458267e+02 | 7.466754e+02 | 2.583257e+03 | 2.780709e+04 | 1.444125e+03 |
| min | -2.550000e+01 | 8.263900e+02 | 3.040000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | -1.690000e+00 | 9.314800e+02 | 2.606000e+01 | 1.351800e+02 | 4.920000e+00 |
| 50% | 8.510000e+00 | 9.800700e+02 | 3.932000e+01 | 2.159300e+02 | 8.130000e+00 |
| 75% | 1.967000e+01 | 1.012500e+03 | 5.833000e+01 | 3.023600e+02 | 1.262000e+01 |
| max | 9.999990e+05 | 9.999990e+05 | 9.999990e+05 | 9.999990e+05 | 9.999990e+05 |
# Replace the extremely large placeholder values: 999017 becomes 0.5,
# anything larger becomes NaN. Done with vectorised masks rather than an
# element-wise lambda, which is very slow on a table of this size.
weather_data_float_types = ['temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed']
placeholder_code = 999017
raw_measurements = weather_data[weather_data_float_types]
weather_data[weather_data_float_types] = (
    raw_measurements
    .mask(raw_measurements > placeholder_code)          # larger than 999017 -> NaN
    .mask(raw_measurements == placeholder_code, 0.5)    # exactly 999017 -> 0.5
)
del raw_measurements  # free the temporary copy of the numeric columns
weather_data.describe()
| temperature | pressure | humidity | wind_direction | wind_speed | |
|---|---|---|---|---|---|
| count | 7.192749e+06 | 7.192749e+06 | 7.192705e+06 | 7.192504e+06 | 7.192504e+06 |
| mean | 8.986448e+00 | 9.688938e+02 | 4.356296e+01 | 2.082134e+02 | 9.716998e+00 |
| std | 1.239194e+01 | 4.987219e+01 | 2.167502e+01 | 1.013677e+02 | 6.837274e+00 |
| min | -2.550000e+01 | 8.263900e+02 | 3.040000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | -1.690000e+00 | 9.314800e+02 | 2.606000e+01 | 1.348300e+02 | 4.920000e+00 |
| 50% | 8.510000e+00 | 9.800600e+02 | 3.932000e+01 | 2.157100e+02 | 8.130000e+00 |
| 75% | 1.967000e+01 | 1.012500e+03 | 5.833000e+01 | 3.021900e+02 | 1.262000e+01 |
| max | 4.030000e+01 | 1.040620e+03 | 1.000000e+02 | 3.600000e+02 | 6.882000e+01 |
import seaborn as sns
# Box plot of each numeric weather column, re-applying the whitegrid theme before each one
for weather_column in ("temperature", "pressure", "humidity", "wind_direction", "wind_speed"):
    sns.set(style="whitegrid")
    ax = sns.boxplot(x=weather_data[weather_column])
# Observed weather: distinct timestamps and how many there are
observed_station_time = observed_station_data.time.unique()
observed_station_total_hours = len(observed_station_time)
display(observed_station_time)
print("The total time hour of observed weather station: {}".format(observed_station_total_hours))
# Grid weather: distinct timestamps and how many there are
grid_station_time = grid_station_data.time.unique()
grid_station_total_hours = len(grid_station_time)
display(grid_station_time)
print("The total time hour of grid weather station: {}".format(grid_station_total_hours))
array(['2017-01-30T16:00:00.000000000', '2017-01-30T17:00:00.000000000',
'2017-01-30T18:00:00.000000000', ...,
'2018-01-31T14:00:00.000000000', '2018-01-31T15:00:00.000000000',
'2018-01-08T10:00:00.000000000'], dtype='datetime64[ns]')
The total time hour of observed weather station: 8782
array(['2017-01-01T00:00:00.000000000', '2017-01-01T01:00:00.000000000',
'2017-01-01T02:00:00.000000000', ...,
'2018-03-27T03:00:00.000000000', '2018-03-27T04:00:00.000000000',
'2018-03-27T05:00:00.000000000'], dtype='datetime64[ns]')
The total time hour of grid weather station: 10806
# concat the weather features from 2017-01-30 16:00:00 to 2018-05-02 23:00:00
#import ipdb
def getWeatherFeatures(airQ_station_id, start='2017-01-30 16:00:00', end='2018-05-02 23:00:00'):
    """Build the hourly feature table for one air-quality station.

    The result has one row per hour between *start* and *end* (inclusive).
    Onto that time grid are left-merged, on ``time``:

    * the station's own rows from the module-level ``airQ_data``
      (without ``station_id``), and
    * for each of its ``k_nn`` neighbours listed in ``aq_stations_knn_df``
      (columns ``N0`` .. ``N{k_nn-1}``), that neighbour's rows from
      ``weather_data``, with columns renamed to ``{rank}_{neighbour}_tem``,
      ``_pre``, ``_hum``, ``_wd``, ``_ws`` and ``_wea``.

    Hours with no matching reading are kept and left as NaN.

    Parameters
    ----------
    airQ_station_id : label of the station in ``aq_stations_knn_df``'s index.
    start, end : bounds of the hourly grid, anything ``pd.date_range`` accepts.
        The defaults reproduce the originally hard-coded window.

    Returns
    -------
    pandas.DataFrame with a ``time`` column followed by the merged features.
    """
    # Ids of the k nearest weather stations, nearest first
    knn_list = [aq_stations_knn_df.at[airQ_station_id, "N{}".format(i)] for i in range(k_nn)]

    # Hourly time grid that every source is aligned to
    time_period = pd.date_range(start=start, end=end, freq="H")
    data_df = pd.DataFrame(time_period, columns=['time'])

    # Pollutant readings of the air-quality station itself
    airQ_data_df = airQ_data[airQ_data.station_id == airQ_station_id].drop(columns=['station_id'])
    data_df = data_df.merge(airQ_data_df, on='time', how='left')

    # Weather readings of each neighbour, tagged with rank and station id so
    # the merged columns stay distinct
    for k, station in enumerate(knn_list):
        prefix = "{}_{}".format(k, station)
        station_data = weather_data[weather_data.station_id == station].drop(columns=['station_id'])
        station_data = station_data.rename(columns={
            "temperature": prefix + "_tem",
            "pressure": prefix + "_pre",
            "humidity": prefix + "_hum",
            "wind_direction": prefix + "_wd",
            "wind_speed": prefix + "_ws",
            "weather": prefix + "_wea",
        })
        data_df = data_df.merge(station_data, on='time', how='left')

    print("==={} finished===".format(airQ_station_id))
    return data_df
# The total length of time should be 10976
# from 2017-01-30 16:00:00 to 2018-05-02 23:00:00
atzx = getWeatherFeatures("aotizhongxin_aq")
display(atzx.shape)
display(atzx.head())
===aotizhongxin_aq finished===
(10976, 25)
| time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | 0_beijing_grid_304_tem | 0_beijing_grid_304_pre | 0_beijing_grid_304_hum | ... | 1_beijing_grid_283_hum | 1_beijing_grid_283_wd | 1_beijing_grid_283_ws | 1_beijing_grid_283_wea | 2_hadian_meo_tem | 2_hadian_meo_pre | 2_hadian_meo_hum | 2_hadian_meo_wd | 2_hadian_meo_ws | 2_hadian_meo_wea | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-30 16:00:00 | 70.0 | 75.0 | 36.0 | 0.9 | 79.0 | 34.0 | -5.89 | 1026.03 | 14.58 | ... | 14.07 | 201.74 | 6.24 | None | -1.6 | 1026.1 | 14.0 | 231.0 | 2.5 | Sunny/clear |
| 1 | 2017-01-30 17:00:00 | 78.0 | 86.0 | 36.0 | 0.1 | 78.0 | 38.0 | -6.16 | 1025.68 | 15.11 | ... | 14.62 | 199.01 | 5.02 | None | -2.0 | 1025.9 | 16.0 | 234.0 | 1.9 | Sunny/clear |
| 2 | 2017-01-30 18:00:00 | 86.0 | 92.0 | 39.0 | 1.1 | 74.0 | 35.0 | -6.44 | 1025.32 | 15.64 | ... | 15.18 | 194.52 | 3.81 | None | -2.9 | 1025.6 | 18.0 | 242.0 | 1.2 | Sunny/clear |
| 3 | 2017-01-30 19:00:00 | 95.0 | NaN | 46.0 | 1.2 | 65.0 | 34.0 | -7.02 | 1024.89 | 16.59 | ... | 16.02 | 189.14 | 3.40 | None | -3.0 | 1025.2 | 20.0 | 254.0 | 1.1 | Sunny/clear |
| 4 | 2017-01-30 20:00:00 | 98.0 | NaN | 42.0 | 1.2 | NaN | 32.0 | -7.61 | 1024.46 | 17.55 | ... | 16.86 | 182.36 | 3.02 | None | -3.8 | 1024.3 | 23.0 | 0.5 | 0.0 | Sunny/clear |
5 rows × 25 columns
# Build the feature table of every air-quality station, keep it in a
# dictionary keyed by station id, and write one CSV per station.
cleaned_data_dir = "D:/Akhila/Air pollution data set/cleaned_data/"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(cleaned_data_dir, exist_ok=True)
airQ_data_dict = {}
airQ_stations_list = list(aq_stations_knn_df.index)
airQ_stations_list.sort(reverse=False)
for station in airQ_stations_list:
    airQ_data_dict[station] = getWeatherFeatures(station)
    airQ_data_dict[station].to_csv("{}{}.csv".format(cleaned_data_dir, station))
===aotizhongxin_aq finished=== ===badaling_aq finished=== ===beibuxinqu_aq finished=== ===daxing_aq finished=== ===dingling_aq finished=== ===donggaocun_aq finished=== ===dongsi_aq finished=== ===dongsihuan_aq finished=== ===fangshan_aq finished=== ===fengtaihuayuan_aq finished=== ===guanyuan_aq finished=== ===gucheng_aq finished=== ===huairou_aq finished=== ===liulihe_aq finished=== ===mentougou_aq finished=== ===miyun_aq finished=== ===miyunshuiku_aq finished=== ===nansanhuan_aq finished=== ===nongzhanguan_aq finished=== ===pingchang_aq finished=== ===pinggu_aq finished=== ===qianmen_aq finished=== ===shunyi_aq finished=== ===tiantan_aq finished=== ===tongzhou_aq finished=== ===wanliu_aq finished=== ===wanshouxigong_aq finished=== ===xizhimenbei_aq finished=== ===yanqin_aq finished=== ===yizhuang_aq finished=== ===yongdingmennei_aq finished=== ===yongledian_aq finished=== ===yufa_aq finished=== ===yungang_aq finished=== ===zhiwuyuan_aq finished===
import missingno as msno
%matplotlib inline
msno.matrix(atzx)
<AxesSubplot:>
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib
def missing_data_analysis(df, station):
    """Draw a bar chart of the fraction of missing values in each column of *df*.

    *station* is used as the chart title. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    # Fraction of null cells per column, as a two-column table: col / rate
    null_fraction = df.isnull().sum() / len(df)
    missing_rate_df = null_fraction.rename_axis('col').reset_index(name='rate')

    fig, ax = plt.subplots(figsize=(30, 10))
    ax.bar(missing_rate_df['col'], missing_rate_df['rate'])

    # Title and axis labels
    ax.set_title(station, fontweight="bold", size=30)
    ax.set_ylabel('Missing Rate', fontsize=20.0)
    ax.set_xlabel('Columns', fontsize=20)

    # Same tick-label size on both axes
    for get_tick_labels in (ax.get_xticklabels, ax.get_yticklabels):
        plt.setp(get_tick_labels(), fontsize=16)

    # Rotate the x tick labels so the column names do not overlap
    fig.autofmt_xdate()
    plt.show()
def msno_analysis(df, station):
    """Print which columns of *df* contain nulls, then draw its missingno matrix.

    *station* only appears in the printed banner line.
    """
    banner = "============== {} ================".format(station)
    print(banner)
    columns_with_nulls = df.isnull().any()
    print(columns_with_nulls)
    msno.matrix(df)
# null report for the atzx frame, printed under the label 'aotizhongxin'
msno_analysis(atzx, 'aotizhongxin')
============== aotizhongxin ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_304_tem True 0_beijing_grid_304_pre True 0_beijing_grid_304_hum True 0_beijing_grid_304_wd True 0_beijing_grid_304_ws True 0_beijing_grid_304_wea True 1_beijing_grid_283_tem True 1_beijing_grid_283_pre True 1_beijing_grid_283_hum True 1_beijing_grid_283_wd True 1_beijing_grid_283_ws True 1_beijing_grid_283_wea True 2_hadian_meo_tem True 2_hadian_meo_pre True 2_hadian_meo_hum True 2_hadian_meo_wd True 2_hadian_meo_ws True 2_hadian_meo_wea True dtype: bool
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib
import os
# read files from directory
cleaned_data_path = 'D:/Akhila/Air pollution data set/cleaned_data/'
aq_file_list = []
airQ_data_dict = {}
# load every cleaned station CSV; the dict is keyed by the full file name
# (including '.csv'), which is how the frames are looked up afterwards
for aq_file in os.listdir(cleaned_data_path):
    # test the extension itself: a substring check would also accept names
    # that merely contain '.csv' (e.g. 'x.csv.bak')
    if aq_file.endswith('.csv'):
        aq_file_list.append(aq_file)
        airQ_data_dict[aq_file] = pd.read_csv(os.path.join(cleaned_data_path, aq_file), index_col=0)
# analyze the missing value
for key, value in airQ_data_dict.items():
    missing_data_analysis(value, key)
    msno_analysis(value, key)
============== aotizhongxin_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_304_tem True 0_beijing_grid_304_pre True 0_beijing_grid_304_hum True 0_beijing_grid_304_wd True 0_beijing_grid_304_ws True 0_beijing_grid_304_wea True 1_beijing_grid_283_tem True 1_beijing_grid_283_pre True 1_beijing_grid_283_hum True 1_beijing_grid_283_wd True 1_beijing_grid_283_ws True 1_beijing_grid_283_wea True 2_hadian_meo_tem True 2_hadian_meo_pre True 2_hadian_meo_hum True 2_hadian_meo_wd True 2_hadian_meo_ws True 2_hadian_meo_wea True dtype: bool
============== badaling_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_224_tem True 0_beijing_grid_224_pre True 0_beijing_grid_224_hum True 0_beijing_grid_224_wd True 0_beijing_grid_224_ws True 0_beijing_grid_224_wea True 1_beijing_grid_223_tem True 1_beijing_grid_223_pre True 1_beijing_grid_223_hum True 1_beijing_grid_223_wd True 1_beijing_grid_223_ws True 1_beijing_grid_223_wea True 2_yanqing_meo_tem True 2_yanqing_meo_pre True 2_yanqing_meo_hum True 2_yanqing_meo_wd True 2_yanqing_meo_ws True 2_yanqing_meo_wea True dtype: bool
============== beibuxinqu_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_263_tem True 0_beijing_grid_263_pre True 0_beijing_grid_263_hum True 0_beijing_grid_263_wd True 0_beijing_grid_263_ws True 0_beijing_grid_263_wea True 1_beijing_grid_242_tem True 1_beijing_grid_242_pre True 1_beijing_grid_242_hum True 1_beijing_grid_242_wd True 1_beijing_grid_242_ws True 1_beijing_grid_242_wea True 2_beijing_grid_262_tem True 2_beijing_grid_262_pre True 2_beijing_grid_262_hum True 2_beijing_grid_262_wd True 2_beijing_grid_262_ws True 2_beijing_grid_262_wea True dtype: bool
============== daxing_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_301_tem True 0_beijing_grid_301_pre True 0_beijing_grid_301_hum True 0_beijing_grid_301_wd True 0_beijing_grid_301_ws True 0_beijing_grid_301_wea True 1_daxing_meo_tem True 1_daxing_meo_pre True 1_daxing_meo_hum True 1_daxing_meo_wd True 1_daxing_meo_ws True 1_daxing_meo_wea True 2_beijing_grid_322_tem True 2_beijing_grid_322_pre True 2_beijing_grid_322_hum True 2_beijing_grid_322_wd True 2_beijing_grid_322_ws True 2_beijing_grid_322_wea True dtype: bool
============== dingling_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_265_tem True 0_beijing_grid_265_pre True 0_beijing_grid_265_hum True 0_beijing_grid_265_wd True 0_beijing_grid_265_ws True 0_beijing_grid_265_wea True 1_beijing_grid_286_tem True 1_beijing_grid_286_pre True 1_beijing_grid_286_hum True 1_beijing_grid_286_wd True 1_beijing_grid_286_ws True 1_beijing_grid_286_wea True 2_pingchang_meo_tem True 2_pingchang_meo_pre True 2_pingchang_meo_hum True 2_pingchang_meo_wd True 2_pingchang_meo_ws True 2_pingchang_meo_wea True dtype: bool
============== donggaocun_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_452_tem True 0_beijing_grid_452_pre True 0_beijing_grid_452_hum True 0_beijing_grid_452_wd True 0_beijing_grid_452_ws True 0_beijing_grid_452_wea True 1_beijing_grid_473_tem True 1_beijing_grid_473_pre True 1_beijing_grid_473_hum True 1_beijing_grid_473_wd True 1_beijing_grid_473_ws True 1_beijing_grid_473_wea True 2_pinggu_meo_tem True 2_pinggu_meo_pre True 2_pinggu_meo_hum True 2_pinggu_meo_wd True 2_pinggu_meo_ws True 2_pinggu_meo_wea True dtype: bool
============== dongsihuan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_chaoyang_meo_tem True 0_chaoyang_meo_pre True 0_chaoyang_meo_hum True 0_chaoyang_meo_wd True 0_chaoyang_meo_ws True 0_chaoyang_meo_wea True 1_beijing_grid_324_tem True 1_beijing_grid_324_pre True 1_beijing_grid_324_hum True 1_beijing_grid_324_wd True 1_beijing_grid_324_ws True 1_beijing_grid_324_wea True 2_beijing_grid_325_tem True 2_beijing_grid_325_pre True 2_beijing_grid_325_hum True 2_beijing_grid_325_wd True 2_beijing_grid_325_ws True 2_beijing_grid_325_wea True dtype: bool
============== dongsi_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_chaoyang_meo_tem True 1_chaoyang_meo_pre True 1_chaoyang_meo_hum True 1_chaoyang_meo_wd True 1_chaoyang_meo_ws True 1_chaoyang_meo_wea True 2_beijing_grid_324_tem True 2_beijing_grid_324_pre True 2_beijing_grid_324_hum True 2_beijing_grid_324_wd True 2_beijing_grid_324_ws True 2_beijing_grid_324_wea True dtype: bool
============== fangshan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_fangshan_meo_tem True 0_fangshan_meo_pre True 0_fangshan_meo_hum True 0_fangshan_meo_wd True 0_fangshan_meo_ws True 0_fangshan_meo_wea True 1_beijing_grid_238_tem True 1_beijing_grid_238_pre True 1_beijing_grid_238_hum True 1_beijing_grid_238_wd True 1_beijing_grid_238_ws True 1_beijing_grid_238_wea True 2_beijing_grid_259_tem True 2_beijing_grid_259_pre True 2_beijing_grid_259_hum True 2_beijing_grid_259_wd True 2_beijing_grid_259_ws True 2_beijing_grid_259_wea True dtype: bool
============== fengtaihuayuan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_fengtai_meo_tem True 0_fengtai_meo_pre True 0_fengtai_meo_hum True 0_fengtai_meo_wd True 0_fengtai_meo_ws True 0_fengtai_meo_wea True 1_beijing_grid_282_tem True 1_beijing_grid_282_pre True 1_beijing_grid_282_hum True 1_beijing_grid_282_wd True 1_beijing_grid_282_ws True 1_beijing_grid_282_wea True 2_beijing_grid_261_tem True 2_beijing_grid_261_pre True 2_beijing_grid_261_hum True 2_beijing_grid_261_wd True 2_beijing_grid_261_ws True 2_beijing_grid_261_wea True dtype: bool
============== guanyuan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_282_tem True 0_beijing_grid_282_pre True 0_beijing_grid_282_hum True 0_beijing_grid_282_wd True 0_beijing_grid_282_ws True 0_beijing_grid_282_wea True 1_beijing_grid_303_tem True 1_beijing_grid_303_pre True 1_beijing_grid_303_hum True 1_beijing_grid_303_wd True 1_beijing_grid_303_ws True 1_beijing_grid_303_wea True 2_hadian_meo_tem True 2_hadian_meo_pre True 2_hadian_meo_hum True 2_hadian_meo_wd True 2_hadian_meo_ws True 2_hadian_meo_wea True dtype: bool
============== gucheng_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_261_tem True 0_beijing_grid_261_pre True 0_beijing_grid_261_hum True 0_beijing_grid_261_wd True 0_beijing_grid_261_ws True 0_beijing_grid_261_wea True 1_mentougou_meo_tem True 1_mentougou_meo_pre True 1_mentougou_meo_hum True 1_mentougou_meo_wd True 1_mentougou_meo_ws True 1_mentougou_meo_wea True 2_shijingshan_meo_tem True 2_shijingshan_meo_pre True 2_shijingshan_meo_hum True 2_shijingshan_meo_wd True 2_shijingshan_meo_ws True 2_shijingshan_meo_wea True dtype: bool
============== huairou_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_huairou_meo_tem True 0_huairou_meo_pre True 0_huairou_meo_hum True 0_huairou_meo_wd True 0_huairou_meo_ws True 0_huairou_meo_wea True 1_beijing_grid_349_tem True 1_beijing_grid_349_pre True 1_beijing_grid_349_hum True 1_beijing_grid_349_wd True 1_beijing_grid_349_ws True 1_beijing_grid_349_wea True 2_beijing_grid_370_tem True 2_beijing_grid_370_pre True 2_beijing_grid_370_hum True 2_beijing_grid_370_wd True 2_beijing_grid_370_ws True 2_beijing_grid_370_wea True dtype: bool
============== liulihe_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_216_tem True 0_beijing_grid_216_pre True 0_beijing_grid_216_hum True 0_beijing_grid_216_wd True 0_beijing_grid_216_ws True 0_beijing_grid_216_wea True 1_beijing_grid_237_tem True 1_beijing_grid_237_pre True 1_beijing_grid_237_hum True 1_beijing_grid_237_wd True 1_beijing_grid_237_ws True 1_beijing_grid_237_wea True 2_beijing_grid_195_tem True 2_beijing_grid_195_pre True 2_beijing_grid_195_hum True 2_beijing_grid_195_wd True 2_beijing_grid_195_ws True 2_beijing_grid_195_wea True dtype: bool
============== mentougou_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_240_tem True 0_beijing_grid_240_pre True 0_beijing_grid_240_hum True 0_beijing_grid_240_wd True 0_beijing_grid_240_ws True 0_beijing_grid_240_wea True 1_mentougou_meo_tem True 1_mentougou_meo_pre True 1_mentougou_meo_hum True 1_mentougou_meo_wd True 1_mentougou_meo_ws True 1_mentougou_meo_wea True 2_shijingshan_meo_tem True 2_shijingshan_meo_pre True 2_shijingshan_meo_hum True 2_shijingshan_meo_wd True 2_shijingshan_meo_ws True 2_shijingshan_meo_wea True dtype: bool
============== miyunshuiku_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_414_tem True 0_beijing_grid_414_pre True 0_beijing_grid_414_hum True 0_beijing_grid_414_wd True 0_beijing_grid_414_ws True 0_beijing_grid_414_wea True 1_beijing_grid_435_tem True 1_beijing_grid_435_pre True 1_beijing_grid_435_hum True 1_beijing_grid_435_wd True 1_beijing_grid_435_ws True 1_beijing_grid_435_wea True 2_beijing_grid_413_tem True 2_beijing_grid_413_pre True 2_beijing_grid_413_hum True 2_beijing_grid_413_wd True 2_beijing_grid_413_ws True 2_beijing_grid_413_wea True dtype: bool
============== miyun_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_miyun_meo_tem True 0_miyun_meo_pre True 0_miyun_meo_hum True 0_miyun_meo_wd True 0_miyun_meo_ws True 0_miyun_meo_wea True 1_beijing_grid_392_tem True 1_beijing_grid_392_pre True 1_beijing_grid_392_hum True 1_beijing_grid_392_wd True 1_beijing_grid_392_ws True 1_beijing_grid_392_wea True 2_beijing_grid_413_tem True 2_beijing_grid_413_pre True 2_beijing_grid_413_hum True 2_beijing_grid_413_wd True 2_beijing_grid_413_ws True 2_beijing_grid_413_wea True dtype: bool
============== nansanhuan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_beijing_grid_302_tem True 1_beijing_grid_302_pre True 1_beijing_grid_302_hum True 1_beijing_grid_302_wd True 1_beijing_grid_302_ws True 1_beijing_grid_302_wea True 2_beijing_grid_282_tem True 2_beijing_grid_282_pre True 2_beijing_grid_282_hum True 2_beijing_grid_282_wd True 2_beijing_grid_282_ws True 2_beijing_grid_282_wea True dtype: bool
============== nongzhanguan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_chaoyang_meo_tem True 0_chaoyang_meo_pre True 0_chaoyang_meo_hum True 0_chaoyang_meo_wd True 0_chaoyang_meo_ws True 0_chaoyang_meo_wea True 1_beijing_grid_324_tem True 1_beijing_grid_324_pre True 1_beijing_grid_324_hum True 1_beijing_grid_324_wd True 1_beijing_grid_324_ws True 1_beijing_grid_324_wea True 2_beijing_grid_303_tem True 2_beijing_grid_303_pre True 2_beijing_grid_303_hum True 2_beijing_grid_303_wd True 2_beijing_grid_303_ws True 2_beijing_grid_303_wea True dtype: bool
============== pingchang_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_pingchang_meo_tem True 0_pingchang_meo_pre True 0_pingchang_meo_hum True 0_pingchang_meo_wd True 0_pingchang_meo_ws True 0_pingchang_meo_wea True 1_beijing_grid_264_tem True 1_beijing_grid_264_pre True 1_beijing_grid_264_hum True 1_beijing_grid_264_wd True 1_beijing_grid_264_ws True 1_beijing_grid_264_wea True 2_beijing_grid_285_tem True 2_beijing_grid_285_pre True 2_beijing_grid_285_hum True 2_beijing_grid_285_wd True 2_beijing_grid_285_ws True 2_beijing_grid_285_wea True dtype: bool
============== pinggu_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_pinggu_meo_tem True 0_pinggu_meo_pre True 0_pinggu_meo_hum True 0_pinggu_meo_wd True 0_pinggu_meo_ws True 0_pinggu_meo_wea True 1_beijing_grid_452_tem True 1_beijing_grid_452_pre True 1_beijing_grid_452_hum True 1_beijing_grid_452_wd True 1_beijing_grid_452_ws True 1_beijing_grid_452_wea True 2_beijing_grid_453_tem True 2_beijing_grid_453_pre True 2_beijing_grid_453_hum True 2_beijing_grid_453_wd True 2_beijing_grid_453_ws True 2_beijing_grid_453_wea True dtype: bool
============== qianmen_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_beijing_grid_282_tem True 1_beijing_grid_282_pre True 1_beijing_grid_282_hum True 1_beijing_grid_282_wd True 1_beijing_grid_282_ws True 1_beijing_grid_282_wea True 2_beijing_grid_324_tem True 2_beijing_grid_324_pre True 2_beijing_grid_324_hum True 2_beijing_grid_324_wd True 2_beijing_grid_324_ws True 2_beijing_grid_324_wea True dtype: bool
============== shunyi_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_shunyi_meo_tem True 0_shunyi_meo_pre True 0_shunyi_meo_hum True 0_shunyi_meo_wd True 0_shunyi_meo_ws True 0_shunyi_meo_wea True 1_beijing_grid_368_tem True 1_beijing_grid_368_pre True 1_beijing_grid_368_hum True 1_beijing_grid_368_wd True 1_beijing_grid_368_ws True 1_beijing_grid_368_wea True 2_beijing_grid_347_tem True 2_beijing_grid_347_pre True 2_beijing_grid_347_hum True 2_beijing_grid_347_wd True 2_beijing_grid_347_ws True 2_beijing_grid_347_wea True dtype: bool
============== tiantan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_beijing_grid_324_tem True 1_beijing_grid_324_pre True 1_beijing_grid_324_hum True 1_beijing_grid_324_wd True 1_beijing_grid_324_ws True 1_beijing_grid_324_wea True 2_beijing_grid_282_tem True 2_beijing_grid_282_pre True 2_beijing_grid_282_hum True 2_beijing_grid_282_wd True 2_beijing_grid_282_ws True 2_beijing_grid_282_wea True dtype: bool
============== tongzhou_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_366_tem True 0_beijing_grid_366_pre True 0_beijing_grid_366_hum True 0_beijing_grid_366_wd True 0_beijing_grid_366_ws True 0_beijing_grid_366_wea True 1_beijing_grid_345_tem True 1_beijing_grid_345_pre True 1_beijing_grid_345_hum True 1_beijing_grid_345_wd True 1_beijing_grid_345_ws True 1_beijing_grid_345_wea True 2_tongzhou_meo_tem True 2_tongzhou_meo_pre True 2_tongzhou_meo_hum True 2_tongzhou_meo_wd True 2_tongzhou_meo_ws True 2_tongzhou_meo_wea True dtype: bool
============== wanliu_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_hadian_meo_tem True 0_hadian_meo_pre True 0_hadian_meo_hum True 0_hadian_meo_wd True 0_hadian_meo_ws True 0_hadian_meo_wea True 1_beijing_grid_283_tem True 1_beijing_grid_283_pre True 1_beijing_grid_283_hum True 1_beijing_grid_283_wd True 1_beijing_grid_283_ws True 1_beijing_grid_283_wea True 2_beijing_grid_262_tem True 2_beijing_grid_262_pre True 2_beijing_grid_262_hum True 2_beijing_grid_262_wd True 2_beijing_grid_262_ws True 2_beijing_grid_262_wea True dtype: bool
============== wanshouxigong_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_beijing_grid_282_tem True 1_beijing_grid_282_pre True 1_beijing_grid_282_hum True 1_beijing_grid_282_wd True 1_beijing_grid_282_ws True 1_beijing_grid_282_wea True 2_fengtai_meo_tem True 2_fengtai_meo_pre True 2_fengtai_meo_hum True 2_fengtai_meo_wd True 2_fengtai_meo_ws True 2_fengtai_meo_wea True dtype: bool
============== xizhimenbei_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_hadian_meo_tem True 0_hadian_meo_pre True 0_hadian_meo_hum True 0_hadian_meo_wd True 0_hadian_meo_ws True 0_hadian_meo_wea True 1_beijing_grid_283_tem True 1_beijing_grid_283_pre True 1_beijing_grid_283_hum True 1_beijing_grid_283_wd True 1_beijing_grid_283_ws True 1_beijing_grid_283_wea True 2_beijing_grid_304_tem True 2_beijing_grid_304_pre True 2_beijing_grid_304_hum True 2_beijing_grid_304_wd True 2_beijing_grid_304_ws True 2_beijing_grid_304_wea True dtype: bool
============== yanqin_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_yanqing_meo_tem True 0_yanqing_meo_pre True 0_yanqing_meo_hum True 0_yanqing_meo_wd True 0_yanqing_meo_ws True 0_yanqing_meo_wea True 1_beijing_grid_225_tem True 1_beijing_grid_225_pre True 1_beijing_grid_225_hum True 1_beijing_grid_225_wd True 1_beijing_grid_225_ws True 1_beijing_grid_225_wea True 2_beijing_grid_224_tem True 2_beijing_grid_224_pre True 2_beijing_grid_224_hum True 2_beijing_grid_224_wd True 2_beijing_grid_224_ws True 2_beijing_grid_224_wea True dtype: bool
============== yizhuang_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_323_tem True 0_beijing_grid_323_pre True 0_beijing_grid_323_hum True 0_beijing_grid_323_wd True 0_beijing_grid_323_ws True 0_beijing_grid_323_wea True 1_beijing_meo_tem True 1_beijing_meo_pre True 1_beijing_meo_hum True 1_beijing_meo_wd True 1_beijing_meo_ws True 1_beijing_meo_wea True 2_beijing_grid_344_tem True 2_beijing_grid_344_pre True 2_beijing_grid_344_hum True 2_beijing_grid_344_wd True 2_beijing_grid_344_ws True 2_beijing_grid_344_wea True dtype: bool
============== yongdingmennei_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_303_tem True 0_beijing_grid_303_pre True 0_beijing_grid_303_hum True 0_beijing_grid_303_wd True 0_beijing_grid_303_ws True 0_beijing_grid_303_wea True 1_beijing_grid_282_tem True 1_beijing_grid_282_pre True 1_beijing_grid_282_hum True 1_beijing_grid_282_wd True 1_beijing_grid_282_ws True 1_beijing_grid_282_wea True 2_beijing_grid_324_tem True 2_beijing_grid_324_pre True 2_beijing_grid_324_hum True 2_beijing_grid_324_wd True 2_beijing_grid_324_ws True 2_beijing_grid_324_wea True dtype: bool
============== yongledian_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_385_tem True 0_beijing_grid_385_pre True 0_beijing_grid_385_hum True 0_beijing_grid_385_wd True 0_beijing_grid_385_ws True 0_beijing_grid_385_wea True 1_beijing_grid_364_tem True 1_beijing_grid_364_pre True 1_beijing_grid_364_hum True 1_beijing_grid_364_wd True 1_beijing_grid_364_ws True 1_beijing_grid_364_wea True 2_beijing_grid_406_tem True 2_beijing_grid_406_pre True 2_beijing_grid_406_hum True 2_beijing_grid_406_wd True 2_beijing_grid_406_ws True 2_beijing_grid_406_wea True dtype: bool
============== yufa_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_278_tem True 0_beijing_grid_278_pre True 0_beijing_grid_278_hum True 0_beijing_grid_278_wd True 0_beijing_grid_278_ws True 0_beijing_grid_278_wea True 1_beijing_grid_257_tem True 1_beijing_grid_257_pre True 1_beijing_grid_257_hum True 1_beijing_grid_257_wd True 1_beijing_grid_257_ws True 1_beijing_grid_257_wea True 2_beijing_grid_299_tem True 2_beijing_grid_299_pre True 2_beijing_grid_299_hum True 2_beijing_grid_299_wd True 2_beijing_grid_299_ws True 2_beijing_grid_299_wea True dtype: bool
============== yungang_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_239_tem True 0_beijing_grid_239_pre True 0_beijing_grid_239_hum True 0_beijing_grid_239_wd True 0_beijing_grid_239_ws True 0_beijing_grid_239_wea True 1_beijing_grid_260_tem True 1_beijing_grid_260_pre True 1_beijing_grid_260_hum True 1_beijing_grid_260_wd True 1_beijing_grid_260_ws True 1_beijing_grid_260_wea True 2_fangshan_meo_tem True 2_fangshan_meo_pre True 2_fangshan_meo_hum True 2_fangshan_meo_wd True 2_fangshan_meo_ws True 2_fangshan_meo_wea True dtype: bool
============== zhiwuyuan_aq.csv ================ time False PM2.5 True PM10 True NO2 True CO True O3 True SO2 True 0_beijing_grid_262_tem True 0_beijing_grid_262_pre True 0_beijing_grid_262_hum True 0_beijing_grid_262_wd True 0_beijing_grid_262_ws True 0_beijing_grid_262_wea True 1_hadian_meo_tem True 1_hadian_meo_pre True 1_hadian_meo_hum True 1_hadian_meo_wd True 1_hadian_meo_ws True 1_hadian_meo_wea True 2_shijingshan_meo_tem True 2_shijingshan_meo_pre True 2_shijingshan_meo_hum True 2_shijingshan_meo_wd True 2_shijingshan_meo_ws True 2_shijingshan_meo_wea True dtype: bool
#use the imputation algorithm to fill missing values
from sklearn.impute import KNNImputer
#import ipdb
def imputeWeatherData(data_df):
    '''
    Fill the gaps of one station frame and one-hot encode its weather text.

    Steps:
    1. Air-quality columns are filled with a column statistic (median,
       except NO2 which uses the mean).
    2. Numeric weather columns are imputed with KNNImputer, one neighbour
       station group ('0_', '1_', ... up to k_nn) at a time. The row number
       is fed to the imputer as an extra feature in place of 'time'.
    3. '*_wea' columns are one-hot encoded and the '*wea_None' dummies dropped.

    data_df: frame with a 'time' column, the six air-quality columns and
        weather columns named '<i>_<station>_<feature>'.
    Returns a new frame: air-quality columns, imputed weather columns (in the
    input's column order) and the weather dummies, joined on 'time'.

    Side effect: the air-quality columns of data_df itself are filled.
    NOTE(review): pd.get_dummies encodes object columns, so 'time' presumably
    has to be a datetime column here — confirm against the callers.
    NOTE(review): NO2 is the only column filled with the mean — confirm this
    is intended.
    '''
    data_cols = list(data_df.columns)
    air_cols = ['time', 'PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2']

    # assign the filled column back instead of a chained inplace fillna,
    # which does not reliably reach the parent frame on recent pandas
    for col in ['PM10', 'PM2.5', 'SO2', 'CO', 'O3']:
        data_df[col] = data_df[col].fillna(data_df[col].median())
    data_df['NO2'] = data_df['NO2'].fillna(data_df['NO2'].mean())

    # categorical weather columns, and the numeric weather columns that are
    # left once air-quality and categorical columns are removed; a list
    # comprehension keeps the input order (a set difference would shuffle it)
    wea_cate_cols = [x for x in data_cols if '_wea' in x]
    excluded_cols = set(air_cols) | set(wea_cate_cols)
    wea_cols = [x for x in data_cols if x not in excluded_cols]
    wea_cate_cols.append('time')

    weather_cate_df = data_df[wea_cate_cols]
    result_df = data_df[air_cols].copy()

    # the frame's own timestamps, re-indexed 0..n-1 so they line up with the
    # imputer output; this replaces a date range hard-coded for one dataset
    timestamps = data_df['time'].reset_index(drop=True)
    row_number = np.arange(len(data_df))

    # impute the data by each KNN station
    for i in range(k_nn):
        # match the station prefix only: a substring test would also pick up
        # e.g. '1_beijing_grid_260_tem' for i == 0 because of the '0_' inside
        prefix = '{}_'.format(i)
        impute_cols = [x for x in wea_cols if x.startswith(prefix)]
        impute_cols.append('time')
        impute_df = data_df[impute_cols].copy()
        # numeric stand-in for 'time' so the imputer can use it as a feature
        impute_df['time'] = row_number
        imputer = KNNImputer(n_neighbors=3)
        data_filled = imputer.fit_transform(impute_df)
        data_filled_df = pd.DataFrame(data_filled, columns=impute_cols)
        # put the real timestamps back before joining
        data_filled_df['time'] = timestamps
        # merge into the result dataframe
        result_df = result_df.merge(data_filled_df, on='time', how='left')

    # one-hot encoding of the categorical weather dataframe & drop '%wea_None' columns
    wea_cate_dummies = pd.get_dummies(weather_cate_df)
    wea_none_cols = [x for x in wea_cate_dummies.columns if 'wea_None' in x]
    wea_cate_dummies = wea_cate_dummies.drop(columns=wea_none_cols)
    # concat the weather categorical features
    result_df = result_df.merge(wea_cate_dummies, on='time', how='left')
    return result_df
# impute the atzx station frame and display the result
atzx_impute = imputeWeatherData(atzx)
atzx_impute
| time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | 0_beijing_grid_304_pre | 0_beijing_grid_304_wd | 0_beijing_grid_304_ws | ... | 2_hadian_meo_tem | 2_hadian_meo_ws | 2_hadian_meo_wd | 2_hadian_meo_wea_Dust | 2_hadian_meo_wea_Fog | 2_hadian_meo_wea_Haze | 2_hadian_meo_wea_Rain | 2_hadian_meo_wea_Sand | 2_hadian_meo_wea_Snow | 2_hadian_meo_wea_Sunny/clear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-30 16:00:00 | 70.0 | 75.0 | 36.000000 | 0.9 | 79.0 | 34.0 | 1026.03 | 195.460000 | 7.130000 | ... | -1.600000 | 2.5 | 231.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2017-01-30 17:00:00 | 78.0 | 86.0 | 36.000000 | 0.1 | 78.0 | 38.0 | 1025.68 | 191.220000 | 6.020000 | ... | -2.000000 | 1.9 | 234.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 2017-01-30 18:00:00 | 86.0 | 92.0 | 39.000000 | 1.1 | 74.0 | 35.0 | 1025.32 | 185.110000 | 4.960000 | ... | -2.900000 | 1.2 | 242.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 2017-01-30 19:00:00 | 95.0 | 66.0 | 46.000000 | 1.2 | 65.0 | 34.0 | 1024.89 | 181.280000 | 4.550000 | ... | -3.000000 | 1.1 | 254.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 2017-01-30 20:00:00 | 98.0 | 66.0 | 42.000000 | 1.2 | 50.0 | 32.0 | 1024.46 | 176.730000 | 4.170000 | ... | -3.800000 | 0.0 | 0.5 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10971 | 2018-05-02 19:00:00 | 37.0 | 66.0 | 50.596548 | 0.7 | 50.0 | 4.0 | 998.73 | 141.733333 | 6.733333 | ... | -5.266667 | 0.9 | 204.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10972 | 2018-05-02 20:00:00 | 37.0 | 66.0 | 50.596548 | 0.7 | 50.0 | 4.0 | 998.73 | 141.733333 | 6.733333 | ... | -5.266667 | 0.9 | 204.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10973 | 2018-05-02 21:00:00 | 37.0 | 66.0 | 50.596548 | 0.7 | 50.0 | 4.0 | 998.73 | 141.733333 | 6.733333 | ... | -5.266667 | 0.9 | 204.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10974 | 2018-05-02 22:00:00 | 37.0 | 66.0 | 50.596548 | 0.7 | 50.0 | 4.0 | 998.73 | 141.733333 | 6.733333 | ... | -5.266667 | 0.9 | 204.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10975 | 2018-05-02 23:00:00 | 37.0 | 66.0 | 50.596548 | 0.7 | 50.0 | 4.0 | 998.73 | 141.733333 | 6.733333 | ... | -5.266667 | 0.9 | 204.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10976 rows × 29 columns
# per-column flag: does any null remain after imputation?
atzx_impute.isnull().any()
time False PM2.5 False PM10 False NO2 False CO False O3 False SO2 False 0_beijing_grid_304_pre False 0_beijing_grid_304_wd False 0_beijing_grid_304_ws False 0_beijing_grid_304_tem False 0_beijing_grid_304_hum False 1_beijing_grid_283_pre False 1_beijing_grid_283_tem False 1_beijing_grid_283_ws False 1_beijing_grid_283_hum False 1_beijing_grid_283_wd False 2_hadian_meo_pre False 2_hadian_meo_hum False 2_hadian_meo_tem False 2_hadian_meo_ws False 2_hadian_meo_wd False 2_hadian_meo_wea_Dust False 2_hadian_meo_wea_Fog False 2_hadian_meo_wea_Haze False 2_hadian_meo_wea_Rain False 2_hadian_meo_wea_Sand False 2_hadian_meo_wea_Snow False 2_hadian_meo_wea_Sunny/clear False dtype: bool
# missing-rate bar chart and nullity matrix for the imputed frame
missing_data_analysis(atzx_impute, 'imputed atzx')
msno.matrix(atzx_impute)
<AxesSubplot:>
# column dtypes of one loaded station frame
airQ_data_dict['donggaocun_aq.csv'].dtypes
time object PM2.5 float64 PM10 float64 NO2 float64 CO float64 O3 float64 SO2 float64 0_beijing_grid_452_tem float64 0_beijing_grid_452_pre float64 0_beijing_grid_452_hum float64 0_beijing_grid_452_wd float64 0_beijing_grid_452_ws float64 0_beijing_grid_452_wea object 1_beijing_grid_473_tem float64 1_beijing_grid_473_pre float64 1_beijing_grid_473_hum float64 1_beijing_grid_473_wd float64 1_beijing_grid_473_ws float64 1_beijing_grid_473_wea object 2_pinggu_meo_tem float64 2_pinggu_meo_pre float64 2_pinggu_meo_hum float64 2_pinggu_meo_wd float64 2_pinggu_meo_ws float64 2_pinggu_meo_wea object dtype: object
import datetime
from matplotlib.dates import DateFormatter
# --- PM2.5 of three stations, first 24 rows of each imputed file ---
# NOTE(review): the x axis is a synthetic hourly range, not the 'time' values
# stored in the files — confirm the first 24 rows really cover 2017-05-01.
index1=pd.date_range(start='2017-05-01 00:00:00', end=' 2017-05-01 23:00:00',freq="H")
df =pd.read_csv("D:/Akhila/Air pollution data set/imputed_data/tiantan_aq_imputed.csv")
df1=pd.read_csv("D:/Akhila/Air pollution data set/imputed_data/dongsi_aq_imputed.csv")
df2=pd.read_csv("D:/Akhila/Air pollution data set/imputed_data/huairou_aq_imputed.csv")
fig,ax = plt.subplots(figsize=(15, 5))
# each legend label names the station whose file feeds that line
# (df = tiantan, df2 = huairou, df1 = dongsi)
ax.plot(index1,df["PM2.5"].iloc[0:24], color='b', label='tiantan_aq')
ax.plot(index1,df2["PM2.5"].iloc[0:24], color='orange', label='huairou_aq')
ax.plot(index1,df1["PM2.5"].iloc[0:24], color='green', label='dongsi_aq')
ax.set_title("PM2.5", fontweight="bold", size=16)
ax.legend(loc="upper right")
# hour:minute tick labels ('%MM' would print a stray literal 'M')
myFmt = DateFormatter("%H:%M")
ax.xaxis.set_major_formatter(myFmt)

# --- PM2.5 of one station over the first 8784 rows (366 days of hours) ---
# NOTE(review): same synthetic x axis as above — confirm the file starts on
# 2017-01-01 00:00.
df =pd.read_csv("D:/Akhila/Air pollution data set/imputed_data/aotizhongxin_aq_imputed.csv")
index1=pd.date_range(start='2017-01-01 00:00:00', end=' 2018-01-01 23:00:00',freq="H")
fig,ax = plt.subplots(figsize=(15, 8))
ax.plot(index1,df["PM2.5"].iloc[0:8784],color='r',label="PM2.5")
ax.set_title("PM2.5", fontweight="bold", size=16)
ax.legend(loc="upper right")
# year-month tick labels
myFmt = DateFormatter("%Y-%m")
ax.xaxis.set_major_formatter(myFmt)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
%matplotlib inline
# column-name constants
lat = 'latitude'
lng = 'longitude'
sid = 'station_id'
# number of neighbour station groups; imputeWeatherData loops over range(k_nn)
k_nn = 3
# empty containers, filled elsewhere
train_data_dict = {}
test_data_dict = {}
# return the dictionary dataframes according to the air quality type for further training & testing
# drop the NaN
def airQualTypesDataDict(data_df):
    """Build one NaN-free frame per air-quality type.

    data_df: frame holding a 'time' column, the six air-quality columns
        ('PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2') and any feature columns.
    Returns a dict keyed by air-quality type. Each value holds the feature
    columns (including 'time') in their original order followed by that one
    air-quality column, with rows containing NaN dropped and 'time' converted
    to pd.Timestamp. data_df itself is not modified.
    Prints the row count before and after dropping NaN for every type.
    """
    # air quality types list
    air_types = ['PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2']
    # everything that is not an air-quality column is a feature ('time' included);
    # keep the frame's own order — a set difference would shuffle it between runs
    feature_cols = [col for col in data_df.columns if col not in air_types]

    data_dict = {}
    for air in air_types:
        # features plus the single air-quality column of this iteration
        air_data_df = data_df[feature_cols + [air]]
        print("{} rows before dropping NaN: {}".format(air, len(air_data_df)))
        air_data_df = air_data_df.dropna()
        print("{} rows after dropping NaN: {}".format(air, len(air_data_df)))
        # assign() returns a new frame, so the caller's data stays untouched
        air_data_df = air_data_df.assign(time=air_data_df['time'].apply(pd.Timestamp))
        data_dict[air] = air_data_df
    return data_dict
# return training & validation data
def splitTrainValData(data_df, label, val_size):
    """Split data_df into a leading train part and a trailing validation part.

    data_df: frame holding the label column and the feature columns.
    label: name of the label column.
    val_size: number of trailing rows that form the validation set.
    Returns (X_train, X_val, y_train, y_val, labels_scaler). The four frames
    hold the raw, unscaled values; labels_scaler is a Box-Cox PowerTransformer
    fitted on the whole label column.

    NOTE(review): the earlier version also built min-max scaled features and
    scaled labels, then overwrote them with the raw splits before returning.
    The raw output is kept here because that is what callers receive — confirm
    whether scaled splits were actually intended.
    NOTE(review): Box-Cox is documented to need strictly positive input, so
    the fit presumably fails on labels containing zeros — confirm.
    """
    label_col = [label]
    # keep the frame's own column order; a set difference would shuffle it
    feature_col = [col for col in data_df.columns if col != label]

    # fitted on every label row and handed back so callers can transform
    labels_scaler = PowerTransformer(copy=True, method='box-cox', standardize=True)
    labels_scaler.fit(data_df[label_col])

    # explicit split position: slicing with -val_size breaks for val_size == 0
    # (-0 is 0, which would give an empty train set and a full validation set)
    split_at = max(len(data_df) - val_size, 0)

    # train set
    train_df = data_df.iloc[:split_at]
    X_train = train_df[feature_col]
    y_train = train_df[label_col]
    # validation set
    val_df = data_df.iloc[split_at:]
    X_val = val_df[feature_col]
    y_val = val_df[label_col]
    return X_train, X_val, y_train, y_val, labels_scaler
# drop the NaN in the concated air types table
def getTrainTestDataDropna(data_df):
    """Drop NaN rows, index by 'time' and separate features from the label.

    data_df: frame with a 'time' column, a 'PM2.5' column and feature columns.
    Returns (X_train, X_val, y_train, y_val). No rows are held out: the train
    and validation frames both cover every remaining row. Feature columns keep
    the order they have in data_df. data_df itself is not modified.
    """
    # labels columns
    label_cols = ['PM2.5']
    cleaned_df = data_df.dropna().set_index('time')
    # preserve the input column order; a set difference would return the
    # features in a different order from run to run
    feature_cols = [col for col in cleaned_df.columns if col not in label_cols]

    # train set
    X_train = cleaned_df[feature_cols]
    y_train = cleaned_df[label_cols]
    # validation set: the same rows, selected again so that the returned
    # frames are independent objects
    X_val = cleaned_df[feature_cols]
    y_val = cleaned_df[label_cols]
    return X_train, X_val, y_train, y_val
# Build the feature / label tables from the imputed station table.
# NOTE: ``atzx_impute`` is assigned by a pd.read_csv call further down in this
# file, so that statement has to run before this one.
# NOTE: the fourth value returned by getTrainTestDataDropna is its ``y_val``;
# it is bound to the name ``y_test`` here and used under that name below.
X_train, X_val, y_train, y_test = getTrainTestDataDropna(atzx_impute)
# Bare expressions: presumably inspected as notebook cell output (a table
# follows in the export); they have no effect when run as a plain script.
X_val
X_val.shape
X_val.iloc[0:1]
| SO2 | 1_beijing_grid_283_pre | O3 | 2_hadian_meo_wea_Dust | NO2 | 0_beijing_grid_304_wd | 1_beijing_grid_283_ws | 2_hadian_meo_hum | 2_hadian_meo_tem | 2_hadian_meo_ws | ... | 2_hadian_meo_wea_Sunny/clear | 2_hadian_meo_pre | 1_beijing_grid_283_hum | 2_hadian_meo_wd | 2_hadian_meo_wea_Rain | 2_hadian_meo_wea_Haze | CO | PM10 | 2_hadian_meo_wea_Sand | 0_beijing_grid_304_hum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| time | |||||||||||||||||||||
| 2017-01-30 16:00:00 | 34.0 | 1021.89 | 79.0 | 0 | 36.0 | 195.46 | 6.24 | 14.0 | -1.6 | 2.5 | ... | 1 | 1026.1 | 14.07 | 231.0 | 0 | 0 | 0.9 | 75.0 | 0 | 14.58 |
1 rows × 27 columns
# Heat map of the pairwise column correlations of ``atzx``.
corrmat = atzx.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 9))
# Draw on the axes that were just created (they are also the current axes).
sns.heatmap(corrmat, vmax=0.9, square=True, ax=corr_ax)
<AxesSubplot:>
import os
# Directory holding the imputed CSV files.
cleaned_data_path = 'D:/Akhila/Air pollution data set/imputed_data/'
# Collect the CSV file names.  Match on the extension itself: a plain
# substring test would also accept names such as 'x.csv.bak'.
aq_file_list = [
    aq_file
    for aq_file in os.listdir(cleaned_data_path)
    if aq_file.endswith('.csv')
]
# Derive the station file path from the directory above so the two cannot
# drift apart; the resulting string is the same as the former literal.
atzx_path = os.path.join(cleaned_data_path, 'aotizhongxin_aq_imputed.csv')
# The first CSV column is used as the row index.
atzx_impute = pd.read_csv(atzx_path, index_col=0)
atzx_impute.head()
| time | PM2.5 | PM10 | NO2 | CO | O3 | SO2 | 0_beijing_grid_304_hum | 0_beijing_grid_304_ws | 0_beijing_grid_304_wd | ... | 2_hadian_meo_pre | 2_hadian_meo_ws | 2_hadian_meo_hum | 2_hadian_meo_wea_Dust | 2_hadian_meo_wea_Fog | 2_hadian_meo_wea_Haze | 2_hadian_meo_wea_Rain | 2_hadian_meo_wea_Sand | 2_hadian_meo_wea_Snow | 2_hadian_meo_wea_Sunny/clear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-30 16:00:00 | 70.0 | 75.0 | 36.0 | 0.9 | 79.0 | 34.0 | 14.58 | 7.13 | 195.46 | ... | 1026.1 | 2.5 | 14.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2017-01-30 17:00:00 | 78.0 | 86.0 | 36.0 | 0.1 | 78.0 | 38.0 | 15.11 | 6.02 | 191.22 | ... | 1025.9 | 1.9 | 16.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 2017-01-30 18:00:00 | 86.0 | 92.0 | 39.0 | 1.1 | 74.0 | 35.0 | 15.64 | 4.96 | 185.11 | ... | 1025.6 | 1.2 | 18.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 2017-01-30 19:00:00 | 95.0 | 66.0 | 46.0 | 1.2 | 65.0 | 34.0 | 16.59 | 4.55 | 181.28 | ... | 1025.2 | 1.1 | 20.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 2017-01-30 20:00:00 | 98.0 | 66.0 | 42.0 | 1.2 | 50.0 | 32.0 | 17.55 | 4.17 | 176.73 | ... | 1024.3 | 0.0 | 23.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 29 columns
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
def showLineChart(data_df, air_type, start, end):
    """Plot one column of ``data_df`` against time for a chosen period.

    Rows are selected with ``start < time <= end`` (start exclusive, end
    inclusive).  The figure is shown with ``plt.show()``; nothing is
    returned.

    Args:
        data_df: DataFrame with a ``time`` column (anything
            ``pd.to_datetime`` accepts) and the column named by
            ``air_type``.  It is not modified.
        air_type: name of the column to plot, e.g. 'PM2.5'.
        start: lower time bound (exclusive).
        end: upper time bound (inclusive).
    """
    # Parse into a local Series instead of writing back into ``data_df``, so
    # plotting has no side effect on the caller's table.
    times = pd.to_datetime(data_df['time'])
    mask = (times > start) & (times <= end)

    fig, ax = plt.subplots(figsize=(30, 7))
    ax.plot(times[mask], data_df.loc[mask, air_type])
    # tick labels as month-day
    ax.xaxis.set_major_formatter(DateFormatter("%m-%d"))
    # Rotate date labels automatically
    fig.autofmt_xdate()
    plt.show()
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
# Period to chart: 15 April 2017 00:00 up to 15 May 2017 23:00.
start = pd.Timestamp(year=2017, month=4, day=15, hour=0)
end = pd.Timestamp(year=2017, month=5, day=15, hour=23)
# Line chart of the PM2.5 column of the imputed station table for that period.
showLineChart(data_df=atzx_impute, air_type='PM2.5', start=start, end=end)
import pandas as pd
from sklearn.metrics import mean_squared_error
import sklearn.metrics as sm
import lightgbm as lgb
# Wrap the tables as LightGBM datasets; the evaluation set references the
# training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_test, reference=lgb_train)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list instead of a set, so the metric order is the same on every run.
    'metric': ['l1', 'l2'],
    'num_leaves': 40,
    'learning_rate': 0.07,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}
# train
print("Starting training...")
# Early stopping goes through the callback: the ``early_stopping_rounds``
# keyword of ``lgb.train`` no longer exists in LightGBM 4, while the callback
# is accepted by older and newer releases alike.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
print('Saving model...')
# save model to file
gbm.save_model('model.txt')
print('Starting predicting...')
# predict with the best iteration found during training
y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
# eval
print('The rmse:', mean_squared_error(y_test, y_pred) ** 0.5)
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_pred), 2))
Starting training... [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000956 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [1] valid_0's l1: 33.681 valid_0's l2: 2380.13 Training until validation scores don't improve for 5 rounds [2] valid_0's l1: 31.7153 valid_0's l2: 2115.23 [3] valid_0's l1: 30.074 valid_0's l2: 1910.76 [4] valid_0's l1: 28.4031 valid_0's l2: 1705.78 [5] valid_0's l1: 26.8653 valid_0's l2: 1527.82 [6] valid_0's l1: 25.4502 valid_0's l2: 1371.91 [7] valid_0's l1: 24.167 valid_0's l2: 1238.96 [8] valid_0's l1: 23.0422 valid_0's l2: 1132.89 [9] valid_0's l1: 21.9197 valid_0's l2: 1025.91 [10] valid_0's l1: 20.8777 valid_0's l2: 933.728 [11] valid_0's l1: 19.9106 valid_0's l2: 851.698 [12] valid_0's l1: 19.0273 valid_0's l2: 779.435 [13] valid_0's l1: 18.2087 valid_0's l2: 716.624 [14] valid_0's l1: 17.4568 valid_0's l2: 662.336 [15] valid_0's l1: 16.7573 valid_0's l2: 614.476 [16] valid_0's l1: 16.1428 valid_0's l2: 572.99 [17] valid_0's l1: 15.5695 valid_0's l2: 536.702 [18] valid_0's l1: 15.026 valid_0's l2: 504.66 [19] valid_0's l1: 14.5134 valid_0's l2: 474.895 [20] valid_0's l1: 14.0518 valid_0's l2: 446.259 Did not meet early stopping. Best iteration is: [20] valid_0's l1: 14.0518 valid_0's l2: 446.259 Saving model... Starting predicting... The rmse: 21.124842728082385 Mean absolute error = 14.05 Mean squared error = 446.26 Median absolute error = 10.73 R2 score = 0.83
import datetime
# x-axis: the timestamps of the rows being plotted, taken from the ``time``
# index of ``y_test``.  Deriving it from the data keeps it the same length as
# the plotted series; a hand-written hourly range only fits while its start,
# end and row count happen to agree with the table.
index = pd.to_datetime(y_test.index)
fig, ax = plt.subplots(figsize=(15, 8))
# true values against the current predictions
ax.plot(index, y_test, color='b', label='True')
ax.plot(index, y_pred, color='orange', label='Prediction')
ax.set_title("PM2.5", fontweight="bold", size=16)
ax.legend(loc="upper right")
# tick labels as "month day hour"
myFmt = DateFormatter("%m %d %H")
ax.xaxis.set_major_formatter(myFmt)
import xgboost as xgb
from xgboost import XGBRegressor
# Fit an XGBoost regressor and predict on the validation features.
model = XGBRegressor(learning_rate=0.013)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Error metrics of the predictions against y_test (MSE computed once and
# reused for the RMSE line).
xgb_mse = sm.mean_squared_error(y_test, y_pred)
xgb_mae = sm.mean_absolute_error(y_test, y_pred)
xgb_medae = sm.median_absolute_error(y_test, y_pred)
xgb_r2 = sm.r2_score(y_test, y_pred)
print('The rmse:', xgb_mse ** 0.5)
print("Mean absolute error =", round(xgb_mae, 2))
print("Mean squared error =", round(xgb_mse, 2))
print("Median absolute error =", round(xgb_medae, 2))
print("R2 score =", round(xgb_r2, 2))
The rmse: 26.149523342983592 Mean absolute error = 16.3 Mean squared error = 683.8 Median absolute error = 8.2 R2 score = 0.74
# Bar chart of the fitted model's feature importances, one bar per feature
# position.
importance = model.feature_importances_
bar_positions = list(range(len(importance)))
plt.bar(bar_positions, importance)
plt.show()
# True values and current predictions over ``index`` (the x-axis built in the
# earlier plotting cell).
fig, ax = plt.subplots(figsize=(15, 8))
for plotted, line_color, line_label in (
    (y_test, 'b', 'True'),
    (y_pred, 'orange', 'Prediction'),
):
    ax.plot(index, plotted, color=line_color, label=line_label)
ax.set_title("PM2.5", fontweight="bold", size=16)
ax.legend(loc="upper right")
# tick labels as "month day hour"
myFmt = DateFormatter("%m %d %H")
ax.xaxis.set_major_formatter(myFmt)
from sklearn.ensemble import AdaBoostRegressor
# Fit an AdaBoost regressor and predict on the validation features.
model = AdaBoostRegressor(learning_rate=0.005)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Error metrics of the predictions against y_test (MSE computed once and
# reused for the RMSE line).
ada_mse = sm.mean_squared_error(y_test, y_pred)
ada_mae = sm.mean_absolute_error(y_test, y_pred)
ada_medae = sm.median_absolute_error(y_test, y_pred)
ada_r2 = sm.r2_score(y_test, y_pred)
print('The rmse:', ada_mse ** 0.5)
print("Mean absolute error =", round(ada_mae, 2))
print("Mean squared error =", round(ada_mse, 2))
print("Median absolute error =", round(ada_medae, 2))
print("R2 score =", round(ada_r2, 2))
The rmse: 27.063878770180615 Mean absolute error = 18.46 Mean squared error = 732.45 Median absolute error = 10.41 R2 score = 0.72
# Predict PM2.5 for a single sample: the first row of the validation features.
# The row is selected from X_val (double brackets keep it a one-row table) so
# its values stay attached to the right columns, in the order the model was
# fitted on.  A hand-typed positional list cannot guarantee that, and a date
# written as 2017-1-30 is plain integer subtraction (1986), not a timestamp.
sample = X_val.iloc[[0]]
y_prediction = model.predict(sample)
y_prediction
array([108.08823529])
# True values and current predictions over ``index`` (the x-axis built in the
# earlier plotting cell).
fig, ax = plt.subplots(figsize=(15, 8))
for plotted, line_color, line_label in (
    (y_test, 'b', 'True'),
    (y_pred, 'orange', 'Prediction'),
):
    ax.plot(index, plotted, color=line_color, label=line_label)
ax.set_title("PM2.5", fontweight="bold", size=16)
ax.legend(loc="upper right")
# tick labels as "month day hour"
myFmt = DateFormatter("%m %d %H")
ax.xaxis.set_major_formatter(myFmt)